yelp_business = spark.read.csv('wasb://main@yelpproject.blob.core.windows.net/yelp_business.csv', sep =',', header=True, mode="DROPMALFORMED", inferSchema=True)
yelp_business.write.saveAsTable("yelp_business", mode="overwrite")
yelp_business.printSchema()
yelp_review = spark.read.csv('wasb://main@yelpproject.blob.core.windows.net/yelp_review_tab.csv', sep ='\t', header=True, mode="DROPMALFORMED", inferSchema=True)
yelp_review.write.saveAsTable("yelp_review", mode="overwrite")
yelp_review.printSchema()
yelp_review.count()
# Subset open businesses and safe as temp table using pyspark
df_business = spark.sql("""
SELECT name, latitude, longitude, stars from yelp_business WHERE is_open IN (1)
""")
df_business.registerTempTable('df_business')
# Number of open bussinesses
df_business.count()
df_business.head(1)
%%sql -o buss_open
/* Subset open businesses and safe as temp table using magic sql command */
SELECT name, latitude, longitude, stars from yelp_business WHERE is_open IN (1)
%%local
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode(connected=True)
scl = [ [0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],\
[0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"] ]
data = [ dict(
type = 'scattergeo',
locationmode = 'USA-states',
lon = buss_open['longitude'],
lat = buss_open['latitude'],
text = buss_open['name'],
mode = 'markers',
marker = dict(
size = 8,
opacity = 0.8,
reversescale = True,
autocolorscale = False,
symbol = 'square',
line = dict(
width=1,
color='rgba(102, 102, 102)'
),
colorscale = scl,
cmin = 0,
color = buss_open['stars'],
cmax = buss_open['stars'].max(),
colorbar=dict(
title="Ratings"
)
))]
layout = dict(
title = 'Yelp business',
colorbar = True,
geo = dict(
scope='usa',
projection=dict( type='albers usa' ),
showland = True,
landcolor = "rgb(250, 250, 250)",
subunitcolor = "rgb(217, 217, 217)",
countrycolor = "rgb(217, 217, 217)",
countrywidth = 0.5,
subunitwidth = 0.5
),
)
fig = dict( data=data, layout=layout )
iplot(fig, validate=False, filename='business_on_map')
yelp_review.groupBy("stars").count().show()
%%sql
SELECT stars, COUNT(*) as N FROM yelp_review GROUP BY stars